In [1]:
#%%
#os.chdir(r'F:\Github\6103-Final-Project')
In [2]:
import pandas as pd
import numpy as np
import pylab as py
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
def dfChkBasics(dframe, valCnt = False): 
  cnt = 1
  print('\ndataframe Basic Check function -')
  
  try:
    print(f'\n{cnt}: info(): ')
    cnt+=1
    print(dframe.info())
  except: pass

  print(f'\n{cnt}: describe(): ')
  cnt+=1
  print(dframe.describe())

  print(f'\n{cnt}: head() -- ')
  cnt+=1
  print(dframe.head())

  print(f'\n{cnt}: shape: ')
  cnt+=1
  print(dframe.shape)

  if (valCnt):
    print('\nValue Counts for each feature -')
    for colname in dframe.columns :
      print(f'\n{cnt}: {colname} value_counts(): ')
      print(dframe[colname].value_counts())
      cnt +=1
In [4]:
# Dataset
# The data in this project contains 31 variables and more than 200,000 used cars' data. 
# 15 variables are anonymous we will drop them later. 
# 150,000 observations will be train set and 50,000 observation will be test set.
In [5]:
df=pd.read_csv("used_car_train_20200313.csv")
# Origin dataset is too large, so we only research on sample have size 20000.
df=df.sample(n=20000,random_state=2020)
In [6]:
# Read Data
#
# Drop meaningless variables
df=df.iloc[:,:16]
df=df.drop(columns='name')
In [7]:
# Check Data
dfChkBasics(df, valCnt= True)
dataframe Basic Check function -

1: info(): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 143572 to 138606
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             20000 non-null  int64  
 1   regDate            20000 non-null  int64  
 2   model              20000 non-null  int64  
 3   brand              20000 non-null  int64  
 4   bodyType           20000 non-null  int64  
 5   fuelType           20000 non-null  float64
 6   gearbox            20000 non-null  object 
 7   power              20000 non-null  object 
 8   kilometer          20000 non-null  object 
 9   notRepairedDamage  20000 non-null  object 
 10  regionCode         20000 non-null  int64  
 11  seller             20000 non-null  int64  
 12  offerType          20000 non-null  float64
 13  creatDate          20000 non-null  float64
 14  price              20000 non-null  float64
dtypes: float64(4), int64(7), object(4)
memory usage: 2.4+ MB
None

2: describe(): 
              SaleID       regDate         model         brand      bodyType  \
count   20000.000000  2.000000e+04  20000.000000  20000.000000  20000.000000   
mean    75513.315150  2.003381e+07     47.727300      8.094150      1.846000   
std     43180.721509  5.365699e+04     49.676568      7.848335      3.763083   
min         3.000000  1.991000e+07      0.000000      0.000000      0.000000   
25%     38595.750000  1.999091e+07     11.000000      1.000000      0.000000   
50%     75645.000000  2.003090e+07     30.000000      6.000000      1.000000   
75%    113090.750000  2.007110e+07     66.000000     13.000000      3.000000   
max    149996.000000  2.015121e+07    247.000000     39.000000    156.000000   

           fuelType    regionCode        seller     offerType     creatDate  \
count  20000.000000  2.000000e+04  2.000000e+04  2.000000e+04  2.000000e+04   
mean       1.390075  2.150097e+05  2.812609e+05  1.457643e+06  1.820903e+07   
std       13.485880  2.059593e+06  2.364529e+06  5.221320e+06  5.960597e+06   
min        0.000000  0.000000e+00  0.000000e+00  0.000000e+00 -4.169894e+00   
25%        0.000000  7.200000e+02  0.000000e+00  0.000000e+00  2.016031e+07   
50%        0.000000  1.992000e+03  0.000000e+00  0.000000e+00  2.016032e+07   
75%        1.000000  3.672000e+03  0.000000e+00  0.000000e+00  2.016033e+07   
max     1103.000000  2.016041e+07  2.016040e+07  2.016041e+07  2.016041e+07   

              price  
count  20000.000000  
mean    5643.327486  
std     7507.319472  
min       -3.902379  
25%      999.000000  
50%     2950.000000  
75%     7490.000000  
max    99900.000000  

3: head() -- 
        SaleID   regDate  model  brand  bodyType  fuelType gearbox power  \
143572  143572  20030007    180     13         3       0.0     128    15   
82758    82758  20080207     64     21         1       2.0       0    67   
3479      3479  20040611     13      4         0       1.0       1   218   
89329    89329  20070606     26     14         4       0.0       0   140   
90675    90675  20020112     88     14         1       0.0       0    74   

       kilometer notRepairedDamage  regionCode  seller   offerType  \
143572         1              6550           0       0  20160312.0   
82758         15                 0        1696       0         0.0   
3479          15                 0        1844       0         0.0   
89329          9                 0        6864       0         0.0   
90675         15                 0        3545       0         0.0   

         creatDate        price  
143572      1000.0    43.102277  
82758   20160312.0  1800.000000  
3479    20160328.0  6700.000000  
89329   20160403.0  6500.000000  
90675   20160404.0  1990.000000  

4: shape: 
(20000, 15)

Value Counts for each feature -

5: SaleID value_counts(): 
67583     1
45619     1
78823     1
79767     1
121390    1
         ..
11583     1
54592     1
89409     1
35309     1
65536     1
Name: SaleID, Length: 20000, dtype: int64

6: regDate value_counts(): 
20000001    29
20000008    27
20000002    26
20000010    24
20000004    24
            ..
20130911     1
19920201     1
19941011     1
19920507     1
20070205     1
Name: regDate, Length: 3558, dtype: int64

7: model value_counts(): 
0      1568
19     1250
4      1104
1       804
29      690
       ... 
216       1
235       1
236       1
237       1
231       1
Name: model, Length: 242, dtype: int64

8: brand value_counts(): 
0     4172
4     2227
14    2202
10    1922
1     1820
6     1357
9      965
5      586
13     498
11     384
3      336
16     304
7      298
8      293
25     283
27     272
21     193
19     187
15     187
20     174
22     163
12     156
26     128
17     126
30     125
24     111
28      95
32      83
29      52
2       49
31      42
18      38
37      38
34      30
33      30
36      26
23      22
35      19
38       6
39       1
Name: brand, dtype: int64

9: bodyType value_counts(): 
0      6076
1      4776
2      4021
3      1757
4      1228
5      1067
6       884
7       169
60        4
150       3
55        1
71        1
156       1
56        1
101       1
136       1
100       1
68        1
89        1
90        1
75        1
65        1
140       1
80        1
125       1
Name: bodyType, dtype: int64

10: fuelType value_counts(): 
0.0      12928
1.0       6356
2.0        296
15.0       117
0.5         44
         ...  
131.0        1
68.0         1
155.0        1
65.0         1
87.0         1
Name: fuelType, Length: 79, dtype: int64

11: gearbox value_counts(): 
0      14306
1       4242
-        197
15       197
75        90
       ...  
439        1
367        1
245        1
333        1
286        1
Name: gearbox, Length: 158, dtype: int64

12: power value_counts(): 
75      1173
0       1114
15      1067
150      822
140      764
        ... 
2623       1
3062       1
466        1
2154       1
950        1
Name: power, Length: 511, dtype: int64

13: kilometer value_counts(): 
15      11468
12.5     1920
0         904
10        777
9         697
        ...  
6378        1
4329        1
480         1
2013        1
538         1
Name: kilometer, Length: 280, dtype: int64

14: notRepairedDamage value_counts(): 
0       14454
-        2455
1        1645
72          9
486         7
        ...  
4046        1
991         1
5936        1
4444        1
6518        1
Name: notRepairedDamage, Length: 1170, dtype: int64

15: regionCode value_counts(): 
0       1729
419       45
764       32
125       25
450       22
        ... 
2150       1
2134       1
3887       1
4149       1
5925       1
Name: regionCode, Length: 5482, dtype: int64

16: seller value_counts(): 
0           19510
20160310       17
20160331       16
20160401       14
20160314       12
            ...  
5800            1
2100            1
75              1
699             1
1999            1
Name: seller, Length: 134, dtype: int64

17: offerType value_counts(): 
0.000000e+00    18064
2.016031e+07       66
2.016032e+07       63
2.016031e+07       59
2.016032e+07       59
                ...  
9.700000e+03        1
5.600000e+03        1
7.990000e+03        1
3.545805e+01        1
6.399000e+03        1
Name: offerType, Length: 380, dtype: int64

18: creatDate value_counts(): 
 2.016040e+07    724
 2.016031e+07    683
 2.016040e+07    659
 2.016033e+07    658
 2.016032e+07    646
                ... 
 9.490000e+02      1
-3.460437e+00      1
-3.874060e+00      1
 4.667299e+01      1
 1.753440e+00      1
Name: creatDate, Length: 893, dtype: int64

19: price value_counts(): 
 500.000000      284
 1500.000000     249
 2500.000000     230
 1000.000000     205
 3500.000000     203
                ... 
 1720.000000       1
 755.000000        1
 9870.000000       1
 10150.000000      1
-3.239698          1
Name: price, Length: 3489, dtype: int64
In [8]:
# Q1 Car Attribute
# Research the influence of a vehicle’s 
# Brand, Model, Body Type, Fuel Type and Gearbox Type
# on the value of the vehicle.
dfQ1 = df.iloc[:,[0,1,2,3,4,5,6,14]]
dfQ1.set_index('SaleID')
dfQ1.replace('-',np.nan,inplace=True)
dfQ1=dfQ1.dropna()

#Change datatype
dfQ1["regDate"] = dfQ1["regDate"].astype('float')
dfQ1["model"] = dfQ1["model"].astype('float')
dfQ1["bodyType"] = dfQ1["bodyType"].astype('int')
dfQ1["fuelType"] = dfQ1["fuelType"].astype('int')
dfQ1['gearbox'] = dfQ1['gearbox'].astype('float')

dfQ1 = dfQ1[(dfQ1['gearbox'] == 0) | (dfQ1['gearbox'] == 1)]
dfQ1['gearbox'] = dfQ1['gearbox'].astype('int')
dfQ1 = dfQ1[dfQ1['fuelType']<=6]
dfQ1 = dfQ1[dfQ1['bodyType']<=7]
dfQ1=dfQ1.drop(dfQ1[dfQ1['price']==float].index)


#Delete outliers
dfQ1['model'] = dfQ1['model'][dfQ1['model'].between(dfQ1['model'].quantile(.025), dfQ1['model'].quantile(.975))]
dfQ1['brand'] = dfQ1['brand'][dfQ1['brand'].between(dfQ1['brand'].quantile(.025), dfQ1['brand'].quantile(.975))]
dfQ1['price'] = dfQ1['price'][dfQ1['price'].between(dfQ1['price'].quantile(.025), dfQ1['price'].quantile(.975))]
dfQ1=dfQ1.dropna()

dfQ1["brand"] = dfQ1["brand"].astype('int')

dfChkBasics(dfQ1)
dataframe Basic Check function -

1: info(): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16883 entries, 82758 to 138606
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SaleID    16883 non-null  int64  
 1   regDate   16883 non-null  float64
 2   model     16883 non-null  float64
 3   brand     16883 non-null  int32  
 4   bodyType  16883 non-null  int32  
 5   fuelType  16883 non-null  int32  
 6   gearbox   16883 non-null  int32  
 7   price     16883 non-null  float64
dtypes: float64(3), int32(4), int64(1)
memory usage: 923.3 KB
None

2: describe(): 
              SaleID       regDate         model         brand      bodyType  \
count   16883.000000  1.688300e+04  16883.000000  16883.000000  16883.000000   
mean    75506.328318  2.003354e+07     43.425458      7.352603      1.730024   
std     43187.916740  5.269620e+04     42.488425      6.856009      1.717652   
min         3.000000  1.991000e+07      0.000000      0.000000      0.000000   
25%     38592.500000  1.999091e+07     11.000000      1.000000      0.000000   
50%     75699.000000  2.003090e+07     30.000000      6.000000      1.000000   
75%    113079.000000  2.007101e+07     65.000000     12.000000      3.000000   
max    149996.000000  2.015120e+07    182.000000     28.000000      7.000000   

           fuelType       gearbox         price  
count  16883.000000  16883.000000  16883.000000  
mean       0.369899      0.219155   5423.949454  
std        0.543073      0.413686   5520.179605  
min        0.000000      0.000000     46.857850  
25%        0.000000      0.000000   1400.000000  
50%        0.000000      0.000000   3400.000000  
75%        1.000000      0.000000   7600.000000  
max        6.000000      1.000000  26900.000000  

3: head() -- 
        SaleID     regDate  model  brand  bodyType  fuelType  gearbox   price
82758    82758  20080207.0   64.0     21         1         2        0  1800.0
3479      3479  20040611.0   13.0      4         0         1        1  6700.0
89329    89329  20070606.0   26.0     14         4         0        0  6500.0
90675    90675  20020112.0   88.0     14         1         0        0  1990.0
137169  137169  20060811.0   22.0      9         1         0        0  3100.0

4: shape: 
(16883, 8)
In [9]:
#QQPlot    

# QQPlot of Brand
sm.qqplot(dfQ1['brand'],fit=True,line='45') 
plt.title("Brand")
py.show()
# QQPlot of bodyType
sm.qqplot(dfQ1['bodyType'],fit=True,line='45') 
plt.title("BodyType")
py.show()
# QQPlot of fuelType
sm.qqplot(dfQ1['fuelType'],fit=True,line='45') 
plt.title("FuelType")
py.show()
# QQPlot of price
sm.qqplot(dfQ1['price'],fit=True,line='45') 
plt.title("Price")
py.show()
In [10]:
# Histogram
#
In [11]:
a=dfQ1.groupby('brand').count().reset_index('brand')
sns.barplot(x=a['brand'],y=a['price'])
plt.xlabel("Brand")
plt.ylabel("Frequency")
plt.title("Brand Histogram")
plt.show()

a=dfQ1.groupby('bodyType').count().reset_index('bodyType')
sns.barplot(x=a['bodyType'],y=a['price'])
plt.xlabel("BodyType")
plt.ylabel("Frequency")
plt.title("BodyType Histogram")
plt.show()

a=dfQ1.groupby('fuelType').count().reset_index('fuelType')
sns.barplot(x=a['fuelType'],y=a['price'])
plt.xlabel("FuelType")
plt.ylabel("Frequency")
plt.title("FuelType Histogram")
plt.show()

plt.hist(dfQ1['price'], bins='auto')
plt.xlabel("Price")
plt.ylabel("Frequency")
plt.title("Price Histogram")
plt.show()
In [12]:
# Boxplot

dfQ1['brand'].plot(kind='box')
plt.show()

dfQ1['bodyType'].plot(kind='box')
plt.show()

dfQ1['fuelType'].plot(kind='box')
plt.show()

dfQ1['price'].plot(kind='box')
plt.show()
In [13]:
# Visualization
# scatter plot
#
#relationship between bodytype and price
fuzzybody= dfQ1['bodyType'] + np.random.normal(0,0.75, size=len(dfQ1['bodyType']))
plt.scatter(fuzzybody, dfQ1.price, alpha = 0.1)
plt.ylabel("Price")
plt.xlabel("BodyType")
plt.title("BodyType")
plt.show()

#relationship between fueltype and price
fuzzyfuel= dfQ1['fuelType'] + np.random.normal(0,0.35, size=len(dfQ1['fuelType']))
plt.scatter(fuzzyfuel, dfQ1.price, alpha = 0.1)
plt.ylabel("Price")
plt.xlabel("FuelType")
plt.title("FuelType")
plt.show()
In [14]:
# Barplot
# relationship between brand and price
grouped=dfQ1.groupby('brand')
g1=grouped['price'].mean().reset_index('brand')
sns.barplot(x=dfQ1['brand'],y=dfQ1['price'])
plt.show()

# relationship between bodyType and price
grouped=dfQ1.groupby('bodyType')
g1=grouped['price'].mean().reset_index('bodyType')
sns.barplot(x=dfQ1['bodyType'],y=dfQ1['price'])
plt.xlim(-0.5,7.5)
plt.show()

# relationship between fuelType and price
grouped=dfQ1.groupby('fuelType')
g1=grouped['price'].mean().reset_index('fuelType')
sns.barplot(x=dfQ1['fuelType'],y=dfQ1['price'])
plt.xlim(-0.5,6.5)
plt.show()

#relationship between gearbox and price
grouped=dfQ1.groupby('gearbox')
g1=grouped['price'].mean().reset_index('gearbox')
sns.barplot(x=dfQ1['gearbox'],y=dfQ1['price'])
plt.xlim(-0.5,3)
plt.show()
In [15]:
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
In [16]:
#Model

x1 = dfQ1[['brand', 'bodyType', 'fuelType', 'gearbox']]
y1 = dfQ1['price']
y1 = y1.astype('int')
print(x1.head())
print(type(x1))
print(y1.head())
print(type(y1))
x1.dtypes
y1.dtypes


x_train1, x_test1, y_train1, y_test1 = train_test_split(x1, y1, test_size = 0.25, random_state=2000)
full_split1 = linear_model.LinearRegression()
full_split1.fit(x_train1, y_train1)
full_split1.fit(x_train1, y_train1)
y_pred1 = full_split1.predict(x_test1)
full_split1.score(x_test1, y_test1)




print('score (train):', full_split1.score(x_train1, y_train1)) 
print('score (test):', full_split1.score(x_test1, y_test1))
print('intercept:', full_split1.intercept_)
print('coef_:', full_split1.coef_)
        brand  bodyType  fuelType  gearbox
82758      21         1         2        0
3479        4         0         1        1
89329      14         4         0        0
90675      14         1         0        0
137169      9         1         0        0
<class 'pandas.core.frame.DataFrame'>
82758     1800
3479      6700
89329     6500
90675     1990
137169    3100
Name: price, dtype: int32
<class 'pandas.core.series.Series'>
score (train): 0.17488780696193862
score (test): 0.17270091896842865
intercept: 3644.087171627535
coef_: [ -90.04033933  565.30333135 1928.84603791 3371.90048594]
In [17]:
#
# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
In [18]:
clf1 = SVC()
clf2 = LogisticRegression()
clf3 = DecisionTreeClassifier()
clf4 = KNeighborsClassifier(n_neighbors=3)
clf5 = linear_model.LinearRegression() 
classifiers = [clf1,clf2,clf3,clf4,clf5]
In [19]:
for c in classifiers:
    c.fit(x_train1,y_train1)
    print('\n%s\n'%(c))
    print(f'train score:  {c.score(x_train1,y_train1)}')
    print(f'test score:  {c.score(x_test1,y_test1)}')
SVC()

train score:  0.018480492813141684
test score:  0.013266998341625208

LogisticRegression()

train score:  0.021797504343705577
test score:  0.015636105188343994

DecisionTreeClassifier()

train score:  0.0771600063181172
test score:  0.016346837242359632

KNeighborsClassifier(n_neighbors=3)

train score:  0.04730690254304217
test score:  0.00923951670220327

LinearRegression()

train score:  0.17488780696193862
test score:  0.17270091896842865
In [20]:
for c in classifiers:
  print('\n%s\n'%(c))
  print(cross_val_score(c, x_test1, y_test1, cv= 10))
  print(f'CV mean:  {np.mean(cross_val_score(c, x_test1, y_test1, cv= 10))}')
SVC()

[0.0212766  0.01421801 0.01421801 0.01895735 0.00947867 0.01421801
 0.007109   0.00947867 0.01421801 0.02132701]
CV mean:  0.014449934455984673

LogisticRegression()

[0.01891253 0.01421801 0.01421801 0.01658768 0.007109   0.01895735
 0.01421801 0.02132701 0.02369668 0.01421801]
CV mean:  0.016346229258400277

DecisionTreeClassifier()

[0.00945626 0.007109   0.01184834 0.00947867 0.02369668 0.03080569
 0.01421801 0.01184834 0.01895735 0.01658768]
CV mean:  0.015400602780858908

KNeighborsClassifier(n_neighbors=3)

[0.00472813 0.         0.007109   0.01421801 0.00473934 0.01184834
 0.00473934 0.007109   0.01421801 0.01184834]
CV mean:  0.008055751627396277

LinearRegression()

[0.1908745  0.22260441 0.21923955 0.09570931 0.12202532 0.17513849
 0.17870348 0.17580575 0.12396148 0.16843828]
CV mean:  0.16725005547620966
In [21]:
#Choose Linear model
from statsmodels.formula.api import ols

lm = ols(formula='price ~ model + brand + bodyType + fuelType + C(gearbox)', data=dfQ1).fit()
print( lm.summary() )
np.mean(lm.predict(dfQ1))
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.183
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     756.0
Date:                Fri, 04 Dec 2020   Prob (F-statistic):               0.00
Time:                        20:54:09   Log-Likelihood:            -1.6772e+05
No. Observations:               16883   AIC:                         3.354e+05
Df Residuals:                   16877   BIC:                         3.355e+05
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
===================================================================================
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept        3348.9107     75.768     44.200      0.000    3200.398    3497.423
C(gearbox)[T.1]  3371.4724     94.239     35.776      0.000    3186.755    3556.190
model              12.7846      0.962     13.290      0.000      10.899      14.670
brand            -114.8488      5.908    -19.440      0.000    -126.429    -103.269
bodyType          543.0716     22.889     23.726      0.000     498.206     587.937
fuelType         1854.2784     72.456     25.592      0.000    1712.257    1996.300
==============================================================================
Omnibus:                     4134.983   Durbin-Watson:                   2.019
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             9160.291
Skew:                           1.409   Prob(JB):                         0.00
Kurtosis:                       5.254   Cond. No.                         155.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Out[21]:
5423.9494539333555
In [22]:
#VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
dfQ1=dfQ1.iloc[:,[2,3,4,5,6]]
vif = pd.DataFrame()
vif["variables"] = dfQ1.columns
vif["VIF"] = [ variance_inflation_factor(dfQ1.values, i) for i in range(dfQ1.shape[1]) ]
print(vif)
#So lm is the final model
  variables       VIF
0     model  2.126201
1     brand  1.965964
2  bodyType  1.822497
3  fuelType  1.414146
4   gearbox  1.274188